/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.db; import java.io.*; import java.util.*; import net.nutch.pagedb.*; import net.nutch.linkdb.*; /****************************************** * IWebDBWriter is an interface to the consolidated * page/link database. It permits certain kinds of * operations. * * This database may be implemented in several different * ways (single or muli-pass, single-machine or distributed). * The user of this interface has no idea which one is * being used. They all commit to the IWebDBWriter contract. * * Note that changes to a webdb are finalized upon the call * to close(). Before the call to close() returns, any * readers of the database should see the db in a pristine * pre-write state. * * @author Mike Cafarella ******************************************/ public interface IWebDBWriter { /** * Flush and complete all writes to the db. */ public void close() throws IOException; /** * addPage(Page page) will insert a Page object into the webdb. * If the Page already exists, the existing one will be overwritten. * (Except for the link analysis score, which we try to attach to * a given URL. If an existing Page is overwritten, we will retain * the link score.) * * Page objects are uniquified by their URLs. It's fine to have * many Pages with different URLs but identical MD5s. (Indeed, * that happens all the time with duplicated pages.) But every * Page in the db must have its own URL. */ public void addPage(Page page) throws IOException; /** * addPageWithScore(Page page) inserts a Page into the webdb. * It works just like the above function, except that link scores * are not preserved if the inserted object already exists. The * inserted object's score will replace one that may already be there. * * This function is useful for the Link Analysis program. */ public void addPageWithScore(Page page) throws IOException; /** * addPageIfNotPresent(Page) works just like addPage(), except that * the insertion will not take place if there is already a Page with * that URL in the webdb. In that case, the call to addPage() is * simply ignored. */ public void addPageIfNotPresent(Page page) throws IOException; /** * addPageIfNotPresent(Page, Link) works just like the above addPage(), * except that a Link is also conditionally added to the webdb. */ public void addPageIfNotPresent(Page page, Link link) throws IOException; /** * deletePage(url) will remove a Page object from the db with the * given URL. Fails silently if there is no Page with the given URL. */ public void deletePage(String url) throws IOException; /** * addLink(Link) will add the given Link to the webdb. If the * Link already exists, the existing one will be overwritten. * * Links are uniquified by both source MD5 and target URL. * Two Links are considered identical only if they match both * fields. * * Links are only permitted in the webdb if they have a valid * source MD5 for a Page that is also in the webdb. When a * Page is removed, the webdb will automatically remove Links * as appropriate. * * (Note that since there can be multiple URLs with identical * content, the webdb basically needs to do reference-counting * for each Link's source-MD5.) */ public void addLink(Link link) throws IOException; }